# Data Cleaning ----

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.4     v stringr 1.4.0
## v tidyr   1.1.2     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(ROCR)


# Load the raw survey responses. readr infers every column as character
# except Age, which it parses as a double.
survey <- read_csv("survey.csv")

# Inspect the free-text Gender field: summary() confirms it is a plain
# character vector of length 1259, and unique() reveals the dozens of
# spellings and variants that must be consolidated before modeling.
summary(survey$Gender)
unique(survey$Gender)
# Consolidate the free-text Gender responses into three codes:
#   "F" = female-identifying, "M" = male-identifying, "X" = other/unparseable.
# Matching with %in% against named vectors replaces the original chains of
# dozens of `==` comparisons joined by `|` (identical matches, readable).
# NOTE(review): "Trans-Female" (capital F) never appears in the data -- only
# "Trans-female" does, which is also listed; the extra entry is harmless.
female_labels <- c(
  "Female", "Woman", "female", "f", "Femake", "woman", "Cis Female",
  "Female (cis)", "femail", "cis-female/femme", "Trans woman",
  "Female (trans)", "Trans-Female", "Trans-female"
)
male_labels <- c(
  "m", "Guy (-ish) ^_^", "Man", "Malr", "Cis Man", "Male-ish", "Cis Male",
  "Make", "Mail", "Male", "male", "male leaning androgynous", "Mal",
  "something kinda male?", "ostensibly male, unsure what that really means",
  "maile", "Male (CIS)", "msle", "cis male"
)
other_labels <- c(
  "queer", "Agender", "Nah", "Androgyne", "fluid", "Neuter",
  "queer/she/they", "non-binary", "Genderqueer", "p", "All", "Enby",
  "A little about you"
)

survey$Gender[survey$Gender %in% female_labels] <- "F"
survey$Gender[survey$Gender %in% male_labels] <- "M"
survey$Gender[survey$Gender %in% other_labels] <- "X"

# Confirm only the three consolidated codes remain.
unique(survey$Gender)
# Keep only respondents who resolved to M/F, then drop two columns that
# are not used downstream: the employee-count bucket and the timestamp.
no_x <- survey[survey$Gender != "X", ]
no_x$no_employees <- NULL
no_x$Timestamp <- NULL

# Print the original (unfiltered) tibble for reference.
survey
# Recode the categorical survey answers to "1"/"0" (and "2" for a third
# option where one exists). The columns are character vectors, so the
# original numeric assignments were silently coerced to "1"/"0" anyway;
# the helper makes that explicit and removes ~80 lines of repetition.
# Values not matched (e.g. NA) are left untouched, exactly as before.
recode_response <- function(x, third = NULL) {
  x[x == "Yes"] <- "1"
  x[x == "No"] <- "0"
  if (!is.null(third)) {
    x[x == third] <- "2"
  }
  x
}

# Gender: male -> "1", female -> "0".
no_x$Gender[no_x$Gender == "M"] <- "1"
no_x$Gender[no_x$Gender == "F"] <- "0"

# Plain Yes/No columns.
binary_cols <- c("treatment", "family_history", "self_employed",
                 "remote_work", "tech_company")
for (col in binary_cols) {
  no_x[[col]] <- recode_response(no_x[[col]])
}

# Columns whose third option is "Don't know" -> "2".
dont_know_cols <- c("benefits", "wellness_program", "seek_help",
                    "anonymity", "mental_vs_physical")
for (col in dont_know_cols) {
  no_x[[col]] <- recode_response(no_x[[col]], third = "Don't know")
}

# Columns whose third option is "Maybe" -> "2".
maybe_cols <- c("mental_health_consequence", "phys_health_consequence",
                "mental_health_interview", "phys_health_interview",
                "obs_consequence")
for (col in maybe_cols) {
  no_x[[col]] <- recode_response(no_x[[col]], third = "Maybe")
}

# Columns whose third option is "Some of them" -> "2".
some_cols <- c("supervisor", "coworkers")
for (col in some_cols) {
  no_x[[col]] <- recode_response(no_x[[col]], third = "Some of them")
}

# care_options uses "Not sure" as its third option.
no_x$care_options <- recode_response(no_x$care_options, third = "Not sure")

# EDA ----

# Overall proportions. The original divided by a hard-coded 1246 (the row
# count of no_x at knit time) and printed the Gender share twice; using
# mean() of the logical mask keeps the figures correct if the upstream
# filtering ever changes, and the duplicate line is dropped.
mean(no_x$treatment == 1)    # share who sought treatment
mean(no_x$Gender == 1)       # share coded male
mean(no_x$remote_work == 1)  # share who work remotely

# Number of respondents who left the comments box empty.
sum(is.na(survey$comments))

# Drop ordinal columns not used in the EDA below.
no_x$work_interfere <- NULL
no_x$leave <- NULL

# Subset by country. NOTE: the original comments on the `us` and `uk`
# assignments were swapped (the block labeled "#uk" built `us` and vice
# versa); corrected here. Column references inside filter() do not need
# the `no_x$` prefix.
uk_us <- filter(no_x, Country %in% c("United Kingdom", "United States"))

# United States respondents only.
us <- filter(no_x, Country == "United States")

# United Kingdom respondents only.
uk <- filter(no_x, Country == "United Kingdom")

# Country-level proportions. The hard-coded denominators (181 for the UK,
# 746 for the US) are replaced by mean() over the logical mask, so the
# figures track the actual subset sizes.

# proportion who sought treatment
mean(uk$treatment == 1)
mean(us$treatment == 1)

# proportion with family history
mean(uk$family_history == 1)
mean(us$family_history == 1)

# proportion with remote work
mean(uk$remote_work == 1)
mean(us$remote_work == 1)

# proportion with care options
mean(uk$care_options == 1)
mean(us$care_options == 1)

# proportion with benefits VERY CLOSE TO HOW MANY SOUGHT TREATMENT IN THE US
mean(uk$benefits == 1)
mean(us$benefits == 1)

# proportion seek help
mean(uk$seek_help == 1)
mean(us$seek_help == 1)

# proportion wellness program
mean(uk$wellness_program == 1)
mean(us$wellness_program == 1)

# proportion who observed negative consequences
mean(uk$obs_consequence == 1)
mean(us$obs_consequence == 1)

# BUG FIX: treatment/family_history are character vectors at this point,
# so the original mean() calls warned ("argument is not numeric") and
# returned NA. Convert to numeric before averaging.
mean(as.numeric(uk$treatment))
mean(as.numeric(uk$family_history))
# Bar charts of the benefits and care_options responses, US vs UK,
# colored by response code (0 = red, 1 = green, 2 = yellow).
ggplot(us) +
  geom_bar(aes(x = factor(benefits), fill = factor(benefits))) +
  xlab("Survey Response") +
  ylab("Number of Respondents") +
  labs(title = "United States' responses to if their workplace offers mental health benefits",
       fill = "Survey Response") +
  scale_fill_manual(values = c("red", "green", "yellow"),
                    labels = c("No", "Yes", "Don't Know"))

ggplot(uk) +
  geom_bar(aes(x = factor(benefits), fill = factor(benefits))) +
  xlab("Survey Response") +
  ylab("Number of Respondents") +
  labs(title = "United Kingdoms' responses to if their workplace offers mental health benefits",
       fill = "Survey Response") +
  scale_fill_manual(values = c("red", "green", "yellow"),
                    labels = c("No", "Yes", "Don't Know"))

ggplot(us) +
  geom_bar(aes(x = factor(care_options), fill = factor(care_options))) +
  xlab("Survey Response") +
  ylab("Number of Respondents") +
  labs(title = "United States' responses to if they know about the care options provided by their employer",
       fill = "Survey Response") +
  scale_fill_manual(values = c("red", "green", "yellow"),
                    labels = c("No", "Yes", "Not sure"))

ggplot(uk) +
  geom_bar(aes(x = factor(care_options), fill = factor(care_options))) +
  xlab("Survey Response") +
  ylab("Number of Respondents") +
  labs(title = "United Kingdoms' responses to if they know about the care options provided by their employer",
       fill = "Survey Response") +
  scale_fill_manual(values = c("red", "green", "yellow"),
                    labels = c("No", "Yes", "Not sure"))

# Respondents who sought treatment, by country.
ustreatment <- us[us$treatment == 1, ]
uktreatment <- uk[uk$treatment == 1, ]

## Plot of Gender of Survey Respondents
ggplot(ustreatment) +
  geom_bar(aes(x = factor(Gender), fill = factor(Gender))) +
  ggtitle("Treatment Sought by Gender in the United States") +
  labs(fill = "Gender") +
  xlab("Gender") +
  ylab("Number of Respondents") +
  scale_fill_manual(values = c("pink", "blue"), labels = c("Female", "Male"))

ggplot(uktreatment) +
  geom_bar(aes(x = factor(Gender), fill = factor(Gender))) +
  ggtitle("Treatment Sought by Gender in the United Kingdom") +
  labs(fill = "Gender") +
  xlab("Gender") +
  ylab("Number of Respondents") +
  scale_fill_manual(values = c("pink", "blue"), labels = c("Female", "Male"))

# Structural overview of the UK+US subset: Age is numeric (with obvious
# bad values, min -1726 / max 329, cleaned later); all other columns are
# still character.
summary(uk_us)

# Convert the character columns to factors for inspection.
# BUG FIX: the original `as.data.frame(unclass(uk_us))` relied on the
# pre-R-4.0 default stringsAsFactors = TRUE; under R >= 4.0 it leaves
# every column as character -- which is why the is.factor check below
# printed all FALSE. Passing stringsAsFactors = TRUE restores the
# intended conversion.
newdf <- as.data.frame(unclass(uk_us), stringsAsFactors = TRUE)

summary(uk_us)
str(uk_us)

# Check which columns of newdf are factors. vapply() pins the return
# type, unlike sapply().
f <- vapply(newdf, is.factor, logical(1))
f
# Cleaning data for the choropleth section: reload the raw export.
survey_data <- read.csv("survey2.csv")

# Drop the employee-count column (10th column of survey2.csv).
survey_data[, 10] <- NULL

# Keep only U.S. and U.K. respondents.
is_us <- survey_data$Country == "United States"
is_uk <- survey_data$Country == "United Kingdom"
survey_data <- survey_data[is_us | is_uk, ]

# How many distinct U.S. states appear in the study? (46)
length(unique(survey_data$state))
# 46 states in the study

# Respondent count per state; CA and WA have the highest counts.
states_and_count <- aggregate(
  data.frame(count = survey_data$state),
  list(value = survey_data$state),
  length
)
#View(states_and_count)

# State-level counts with locations, prepared externally.
usstates_and_loc <- read.csv("usstates_count.csv")


# 6 states not included are "AK" "AR" "DE" "HI" "MT" "ND", removing those to combine two data sets



# a lot of use of polygon/leaflet format coming from this webpage: https://rstudio.github.io/leaflet/choropleths.html

# NOTE(review): geojson_read() comes from the geojsonio package and
# leaflet()/%>% pipelines from the leaflet package, but no
# library(geojsonio) or library(leaflet) call appears anywhere in this
# file -- confirm they are loaded before this chunk runs.
# Fetch US state polygons as an sp (spatial) object.
states <- 
    geojson_read( 
        x = "https://raw.githubusercontent.com/PublicaMundi/MappingAPI/master/data/geojson/us-states.json"
        , what = "sp"
    )
#View(states$name)
# NOTE(review): this attaches counts purely by position, assuming the
# rows of usstates_and_loc are in the same order as the GeoJSON
# features -- verify, or join on state name instead.
states$count<-usstates_and_loc$count
#View(states)
# leaflet map
# Base map centered on the continental US at zoom level 4.
m <- leaflet(states) %>%
  setView(-96, 37.8, 4) %>%
  addTiles()
m %>% addPolygons()
# Binned color scale over respondent counts.
bins <- c(0, 1, 10, 20, 50, 100, 200)
pal <- colorBin("YlOrRd", domain = states$count, bins = bins)

# Hover labels: state name plus respondent count, rendered as HTML.
labels <- sprintf(
  "<strong>%s</strong><br/>%g survey respondents",
  states$name, states$count
) %>% lapply(htmltools::HTML)

# Choropleth layer with a white dashed border and hover highlighting.
m <- m %>% addPolygons(
  fillColor = ~pal(count),
  weight = 2,
  opacity = 1,
  color = "white",
  dashArray = "3",
  fillOpacity = 0.7,
  highlight = highlightOptions(
    weight = 5,
    color = "#666",
    dashArray = "",
    fillOpacity = 0.7,
    bringToFront = TRUE),
  label = labels,
  labelOptions = labelOptions(
    style = list("font-weight" = "normal", padding = "3px 8px"),
    textsize = "15px",
    direction = "auto")
  )
m %>% addLegend(pal = pal, values = ~count, opacity = 0.7, title = NULL,
  position = "bottomright")
## 0 – 1
## 1 – 10
## 10 – 20
## 20 – 50
## 50 – 100
## 100 – 200
## Leaflet | © OpenStreetMap contributors, CC-BY-SA
# Print the finished choropleth (renders the htmlwidget when knitted).
m

# Model Building ----

# Cleaning data for the model-building section.
library(knitr)
library(plotly)

survey_data <- read.csv("survey2.csv")

# getting rid of the number of employees column (10th column)
survey_data[, 10] <- NULL

# getting rid of all countries except the U.S. and U.K.
survey_data <- survey_data[survey_data$Country == "United States" | survey_data$Country == "United Kingdom", ]

# Drop columns the models do not use.
survey_data$no_employees <- NULL
survey_data$Timestamp <- NULL
survey_data$comments <- NULL
survey_data$state <- NULL
survey_data$X <- NULL

# Remove rows with any NA, drop non-M/F respondents, then keep the first
# 914 rows. NOTE(review): the 1:914 slice looks arbitrary -- confirm it
# is not silently discarding usable rows.
survey_data00 <- na.omit(survey_data)
survey_data000 <- survey_data00[which(survey_data00$Gender != "X"), ]
survey_data2 <- survey_data000[1:914, ]

# BUG FIX: the original deleted age outliers with
#   survey_data2[-c(which(Age < 18)), ] and [-c(which(Age > 100)), ].
# If which() matches nothing it returns integer(0), and x[-integer(0), ]
# drops EVERY row. Logical subsetting is safe and keeps exactly the
# intended 18-100 range (Ages contain no NA after na.omit above).
survey_data2 <- survey_data2[survey_data2$Age >= 18 & survey_data2$Age <= 100, ]

# Bucket Age into decade codes "0" (10-19) through "6" (70-79); ages
# 80-100 match no branch and become NA, exactly as in the original.
survey_data2 <- survey_data2 %>%
  mutate(Age = case_when(
    Age >= 10 & Age <= 19 ~ "0",
    Age >= 20 & Age <= 29 ~ "1",
    Age >= 30 & Age <= 39 ~ "2",
    Age >= 40 & Age <= 49 ~ "3",
    Age >= 50 & Age <= 59 ~ "4",
    Age >= 60 & Age <= 69 ~ "5",
    Age >= 70 & Age <= 79 ~ "6"
  ))

# Collapse the four 'leave' levels to a binary Easy/Difficult scale.
survey_data2$leave[survey_data2$leave %in% c("Very easy", "Somewhat easy")] <- "Easy"
survey_data2$leave[survey_data2$leave %in% c("Very difficult", "Somewhat difficult")] <- "Difficult"

#View(survey_data2)

#factor(survey_data$self_employed)
#is.factor(survey_data$Age)
#is.factor(survey_data$self_employed)
#survey_data$Age = col_double()


#survey_data2[,2:22] <- lapply(survey_data2[,2:22], factor)
#survey_data[,1] <- lapply(survey_data2[,1], double)


# Factor every column except Age (column 1), then append factored Age
# LAST. WARNING: downstream code indexes columns by POSITION (e.g.
# [,4] for the treatment label), so this column order must not change.
factorized <- lapply(survey_data2[,2:22], factor)
double1 <- as.factor(survey_data2$Age)

# dplyr was already attached at the top of the file; this reload is a
# harmless no-op.
library(dplyr)


factorized$Age <- double1

# lapply() produced a named list of factors; rebuild a data.frame.
factorized <- as.data.frame(factorized)
#View(factorized)

#splitting to UK and US for two models
UK_rf_input <- subset(factorized, Country == "United Kingdom")
UK_rf_input$Country <- NULL
US_rf_input <- subset(factorized, Country == "United States")
US_rf_input$Country <- NULL

# Oversample both countries to 1,000 rows (sampling rows with
# replacement). NOTE(review): there is no set.seed() before these draws,
# so the resampled data -- and everything downstream -- is not
# reproducible run-to-run.
UK_rf_input <- UK_rf_input[sample(nrow(UK_rf_input), 1000, replace = TRUE), ]
US_rf_input <- US_rf_input[sample(nrow(US_rf_input), 1000, replace = TRUE), ]

# BUG FIX: at this point treatment is still the factor "Yes"/"No" (it is
# recoded to "0"/"1" further down), so the original
# sum(treatment == 1)/1000 compared against a level that does not exist
# and always printed 0. Compare against "Yes" for the true base rate.
mean(UK_rf_input$treatment == "Yes")
mean(US_rf_input$treatment == "Yes")
#Hypertuning the mtry parameter for the UK
# Column 4 is taken as the treatment outcome after Country was dropped
# above. NOTE(review): positional indexing -- verify the column order if
# survey2.csv or the upstream drops ever change.
UK_rf_label<-UK_rf_input[,4]
UK_rf_features<-UK_rf_input[,-4]
set.seed(1950)
#tuneRF(UK_rf_features, UK_rf_label, 4.58,ntreeTry=500, stepFactor=1, improve=0.05,trace=TRUE, plot=TRUE, doBest=FALSE)



#Hypertuning the mtry parameter for the US
US_rf_label<-US_rf_input[,4]
US_rf_features<-US_rf_input[,-4]

#tuneRF(US_rf_features, US_rf_label, 4.58,ntreeTry=500, stepFactor=1, improve=0.05,trace=TRUE, plot=TRUE, doBest=FALSE)


# Recode the treatment factor to "0"/"1" for modeling. Note the label
# vectors extracted above were taken BEFORE this recode, so they still
# hold the "Yes"/"No" levels.
UK_rf_input$treatment <- as.factor(ifelse(UK_rf_input$treatment=="No", "0", "1"))
US_rf_input$treatment <- as.factor(ifelse(US_rf_input$treatment=="No", "0", "1"))
# UK model

# create test and train

# creating sample rows for the test and train set
sample_rows = 1:nrow(UK_rf_input)

# using set seed to make results reproducible
set.seed(1984) 
test_rows = sample(sample_rows,
                   dim(UK_rf_input)[1]*.10, # using 10% of dataset as test rows
                   replace = FALSE)  # to ensure no duplicate samples

# Partition the data between training and test sets 
UK_train = UK_rf_input[-test_rows,]
UK_test = UK_rf_input[test_rows,]

# NOTE(review): the comment below says mtry starts at 4.58 (sqrt of the
# predictor count), but the call actually uses mtry = 12 -- presumably a
# value chosen after tuning; confirm which was intended.
# square root of predictors is 4.58 so mtry level initially will be 4.58
set.seed(1950)    
                            
UK_initial_RF = randomForest(treatment~.,       
                            UK_train, 
                            ntree = 500,        # number of trees grown
                            mtry = 12,          # predictors tried per split
                            replace = TRUE, 
                            sampsize = 100,     # bootstrap sample per tree
                            nodesize = 5,       # minimum terminal-node size
                            importance = TRUE,   
                            proximity = FALSE,  
                            norm.votes = TRUE,  
                            do.trace = FALSE,   
                            keep.forest = TRUE,
                            keep.inbag = TRUE)

# Out-of-bag confusion matrix (counts plus a class.error column).
kable(UK_initial_RF$confusion)
## 0 1 class.error
## 0 416 31 0.0693512
## 1 71 382 0.1567329
# Overall OOB accuracy from the confusion matrix.
# BUG FIX: randomForest's $confusion carries a third column,
# class.error. The original summed the WHOLE matrix for the denominator,
# so the per-class error rates leaked in (printing 0.886444 instead of
# the true 798/900 = 0.886667). Restrict to the 2x2 count portion.
conf_counts <- UK_initial_RF$confusion[, 1:2]
UK_RF_acc <- sum(diag(conf_counts)) / sum(conf_counts)
UK_RF_acc

# OOB predictions alongside the training rows for inspection.
ukpred <- data.frame(UK_initial_RF$predicted, UK_train)
ukpredlist <- as.list(UK_initial_RF$predicted)
labellist <- as.list(UK_train[,4])

# OOB vote share for the positive class ("1"), wrapped for ROCR.
UK_RF_2_prediction <- as.data.frame(as.numeric(as.character(UK_initial_RF$votes[,2])))

#View(UK_train)
# Actual training labels (column 4 -- treatment, per the extraction
# above; verify if column order changes).
UK_train_actual <- data.frame(UK_train[,4])

# ROC AUC
UK_prediction_comparison = prediction(UK_RF_2_prediction, UK_train_actual)

#View(UK_prediction_comparison)

# Create a performance object for ROC curve where:
# tpr = true positive rate.
# fpr = false positive rate.
UK_pred_performance = performance(UK_prediction_comparison, 
                                         measure = "tpr",    #<- performance measure to use for the evaluation
                                         x.measure = "fpr")  #<- 2nd performance measure to use for the evaluation
#View(UK_pred_performance)

# Pull the raw classification counts out of the prediction object so the
# rates can be recomputed by hand (each slot is a list of cutoffs).
UK_rates = data.frame(fp = UK_prediction_comparison@fp,  #<- false positive classification.
                             tp = UK_prediction_comparison@tp,  #<- true positive classification.
                             tn = UK_prediction_comparison@tn,  #<- true negative classification.
                             fn = UK_prediction_comparison@fn)  #<- false negative classification.

colnames(UK_rates) = c("fp", "tp", "tn", "fn")

#View(UK_rates)

# As the rows go down the number of remaining unclassified items in the set decreases.
# The first row is the starting point with the initial counts of the positive and 
# negative values — that's why ROCR adds an extra row to the output.

# Now calculate the true positive and false positive rates from the counts.
#str(UK_rates)
tpr = UK_rates$tp / (UK_rates$tp + UK_rates$fn)
fpr = UK_rates$fp / (UK_rates$fp + UK_rates$tn)

# Sanity check: these hand-computed rates should equal the x/y values the
# performance() object produced.
UK_rates_comparison = data.frame(UK_pred_performance@x.values,
                                        UK_pred_performance@y.values,
                                        fpr,
                                        tpr)
colnames(UK_rates_comparison) = c("x.values","y.values","fpr","tpr") #<- rename columns accordingly.
#View(UK_rates_comparison)

#dev.off() 
# Close every open graphics device so the ROC plot starts on a clean one.
while (!is.null(dev.list())) dev.off()

# ROC curve for the UK model (built from OOB vote shares vs actual labels).
plot(UK_pred_performance, 
     col = "orange", 
     lwd = 3, 
     main = "ROC curve")
grid(col = "black")


# Add a 45-degree reference line (performance of a no-skill classifier).
abline(a = 0, 
       b = 1,
       lwd = 2,
       lty = 2,
       col = "blue")

# Calculate the area under curve (AUC), which can help you compare the 
# ROC curves of different models for their relative accuracy.
UK_auc_RF = performance(UK_prediction_comparison, 
                               "auc")@y.values[[1]]
UK_auc_RF
## [1] 0.9617662
# Annotate the ROC plot with the AUC, rounded to two decimals.
text(x = 0.5,
     y = 0.5,
     labels = sprintf("AUC = %s", round(UK_auc_RF, 2)))

# Repeat of the UK OOB confusion matrix for the model-evaluation section.
kable(UK_initial_RF$confusion)
0 1 class.error
0 416 31 0.0693512
1 71 382 0.1567329
# Overall OOB accuracy; exclude the class.error column so the per-class
# error rates are not summed into the denominator (the original version
# reported 0.886444 instead of the true 798/900 = 0.886667).
UK_conf_counts <- UK_initial_RF$confusion[, c("0", "1")]
UK_RF_acc <- sum(diag(UK_conf_counts)) / sum(UK_conf_counts)
UK_RF_acc
## [1] 0.886444
# Full caret report on the OOB predictions, treating "1" (sought
# treatment) as the positive class; mode = "everything" adds F1 etc.
confusionMatrix(as.factor(UK_initial_RF$predicted), as.factor(UK_train$treatment), positive = "1", dnn=c("Prediction", "Actual"), mode = "everything")
## Confusion Matrix and Statistics
## 
##           Actual
## Prediction   0   1
##          0 416  71
##          1  31 382
##                                           
##                Accuracy : 0.8867          
##                  95% CI : (0.8641, 0.9066)
##     No Information Rate : 0.5033          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7735          
##                                           
##  Mcnemar's Test P-Value : 0.0001127       
##                                           
##             Sensitivity : 0.8433          
##             Specificity : 0.9306          
##          Pos Pred Value : 0.9249          
##          Neg Pred Value : 0.8542          
##               Precision : 0.9249          
##                  Recall : 0.8433          
##                      F1 : 0.8822          
##              Prevalence : 0.5033          
##          Detection Rate : 0.4244          
##    Detection Prevalence : 0.4589          
##       Balanced Accuracy : 0.8870          
##                                           
##        'Positive' Class : 1               
## 
#finding the error rate, hit rate 
# The original compared the OOB predictions (training rows only) against
# ALL of UK_rf_input, which triggered silent recycling ("longer object
# length is not a multiple ..." warnings) and then divided by a hard-coded
# 1000. Compare against the training labels and use mean() instead.
UK_double_original <- as.double(UK_train$treatment)
UK_double_predictions <- as.double(UK_initial_RF$predicted)
UK_residuals <- as.data.frame(UK_double_predictions - UK_double_original)
# Average signed residual (bias of the predictions).
mean(UK_double_predictions - UK_double_original)
# Most important variables: Gini-based importance (type = 2), scaled.
kable(as.data.frame(importance(UK_initial_RF, type = 2, scale = TRUE)))
MeanDecreaseGini
Gender 1.109131
self_employed 1.639575
family_history 2.943197
remote_work 1.126799
tech_company 1.114462
benefits 2.429243
care_options 2.612736
wellness_program 1.276998
seek_help 1.679600
anonymity 2.064385
leave 2.348194
mental_health_consequence 2.337719
phys_health_consequence 1.229083
coworkers 2.983042
supervisor 2.011853
mental_health_interview 1.271313
phys_health_interview 2.293574
mental_vs_physical 2.921869
obs_consequence 1.483449
Age 3.357769
importance_colmn <- as.data.frame(UK_initial_RF$importance)
#varImpPlot(UK_initial_RF,type=2)
# Importance table with an explicit feature-name column for plotting.
feat_imp_df <- importance(UK_initial_RF) %>%
  data.frame() %>%
  mutate(feature = row.names(.))

# Horizontal bar chart of Mean Decrease Gini, sorted by importance.
# geom_col() is the idiomatic replacement for geom_bar(stat = "identity").
ggplot(feat_imp_df, aes(x = reorder(feature, MeanDecreaseGini),
                        y = MeanDecreaseGini)) +
  geom_col() +
  coord_flip() +
  theme_classic() +
  labs(
    x     = "Feature",
    y     = "Importance",
    title = "Mean Decrease Gini Plot U.K."
  )

# Per-tree error trajectories: OOB error plus the two class error rates.
UK_RF_error <- data.frame(seq_len(nrow(UK_initial_RF$err.rate)),
                          UK_initial_RF$err.rate)

#View(UK_RF_error)

colnames(UK_RF_error) <- c("Number of Trees", "OutBoxerror",
                           "No", "Yes")

# Add another variable that measures the difference between the error rates, in
# some situations we would want to minimize this but need to use caution because
# it could be that the differences are small but that both errors are really high,
# just another point to track. 

UK_RF_error$Diff <- UK_RF_error$Yes - UK_RF_error$No

# Interactive line plot of all four error curves.
fig_UK <- plot_ly(x = UK_RF_error$`Number of Trees`,
                  y = UK_RF_error$Diff, name = "Diff",
                  type = 'scatter', mode = 'lines') %>%
  add_trace(y = UK_RF_error$OutBoxerror, name = "O.O.B error") %>%
  add_trace(y = UK_RF_error$No, name = "No") %>%
  add_trace(y = UK_RF_error$Yes, name = "Yes")

fig_UK
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
1002003004005000.050.10.150.20.250.30.35
DiffO.O.B errorNoYes
# NOTE(review): the error table above shows OOB error leveling off around
# 0.11, not 0.4 — the original note looks stale; confirm against fig_UK.
# NOTE(review): per the class.error column (0.069 for "0" vs 0.157 for
# "1"), the model classifies NOT seeking treatment better — the original
# comment claimed the opposite; verify intent.
kable(UK_initial_RF$confusion)
0 1 class.error
0 416 31 0.0693512
1 71 382 0.1567329
# Overall OOB accuracy; drop the class.error column before summing so the
# error rates do not contaminate the denominator.
UK_conf_counts <- UK_initial_RF$confusion[, c("0", "1")]
UK_RF_acc <- sum(diag(UK_conf_counts)) / sum(UK_conf_counts)
UK_RF_acc
## [1] 0.886444
# Five rows with the lowest OOB error (ties at the run's minimum error).
kable(head(UK_RF_error[order(UK_RF_error$OutBoxerror, decreasing = FALSE),], n=5))
Number of Trees OutBoxerror No Yes Diff
495 495 0.1122222 0.0671141 0.1567329 0.0896188
496 496 0.1122222 0.0671141 0.1567329 0.0896188
497 497 0.1122222 0.0671141 0.1567329 0.0896188
498 498 0.1122222 0.0671141 0.1567329 0.0896188
499 499 0.1122222 0.0671141 0.1567329 0.0896188
# NOTE(review): per the table above, the minimum OOB error is 0.1122222,
# first reached around tree 495 — the original note (0.1555707 at 282
# trees) appears stale; confirm.
# creating sample rows for the test and train set (US model)
sample_rows <- seq_len(nrow(US_rf_input))

# using set seed to make results reproducible
set.seed(1984)
test_rows <- sample(sample_rows,
                    floor(nrow(US_rf_input) * 0.10), # 10% of dataset as test rows (the old comment said 30% but the code used .10)
                    replace = FALSE)                 # to ensure no duplicate samples

# Partition the data between training and test sets
US_train <- US_rf_input[-test_rows, ]
US_test  <- US_rf_input[test_rows, ]

# NOTE(review): sqrt(#predictors) is ~4.58, but mtry is 15 here while the
# UK model used 12 — confirm the asymmetry is intentional.
set.seed(1950)

US_initial_RF <- randomForest(treatment ~ .,
                              US_train,
                              ntree = 500,          # number of trees grown
                              mtry = 15,            # variables tried at each split
                              replace = TRUE,       # bootstrap sampling with replacement
                              sampsize = 100,       # rows drawn per tree
                              nodesize = 5,         # minimum terminal-node size
                              importance = TRUE,    # record variable importance
                              proximity = FALSE,
                              norm.votes = TRUE,    # report votes as fractions
                              do.trace = FALSE,
                              keep.forest = TRUE,
                              keep.inbag = TRUE)
# OOB confusion matrix for the US model (counts plus per-class error rate).
kable(US_initial_RF$confusion)
0 1 class.error
0 266 145 0.3527981
1 70 419 0.1431493
# Overall OOB accuracy. Exclude the class.error column: summing the whole
# confusion matrix adds the error rates into the denominator and deflates
# the accuracy (0.7606919 printed vs the correct 685/900 = 0.7611 reported
# by confusionMatrix() below).
US_conf_counts <- US_initial_RF$confusion[, c("0", "1")]
US_RF_acc <- sum(diag(US_conf_counts)) / sum(US_conf_counts)
US_RF_acc
## [1] 0.7606919
# OOB vote share for class "1". The votes matrix is already numeric, so
# the original as.numeric(as.character(...)) round-trip was redundant.
US_RF_2_prediction <- as.data.frame(US_initial_RF$votes[,2])

#View(US_train)
US_train_actual <- data.frame(US_train[,4])

# ROC AUC: ROCR prediction object pairing vote shares with actual labels.
US_prediction_comparison <- prediction(US_RF_2_prediction, US_train_actual)

#View(UK_prediction_comparison)

# Create a performance object for ROC curve where:
# tpr = true positive rate.
# fpr = false positive rate.
US_pred_performance = performance(US_prediction_comparison, 
                                         measure = "tpr",    #<- performance measure to use for the evaluation
                                         x.measure = "fpr")  #<- 2nd performance measure to use for the evaluation
#View(US_pred_performance)

# Pull the raw classification counts out of the prediction object so the
# rates can be recomputed by hand (each slot is a list of cutoffs).
US_rates = data.frame(fp = US_prediction_comparison@fp,  #<- false positive classification.
                             tp = US_prediction_comparison@tp,  #<- true positive classification.
                             tn = US_prediction_comparison@tn,  #<- true negative classification.
                             fn = US_prediction_comparison@fn)  #<- false negative classification.

colnames(US_rates) = c("fp", "tp", "tn", "fn")

#View(US_rates)

# As the rows go down the number of remaining unclassified items in the set decreases.
# The first row is the starting point with the initial counts of the positive and 
# negative values — that's why ROCR adds an extra row to the output.

# Now calculate the true positive and false positive rates from the counts.
#str(US_rates)
tpr = US_rates$tp / (US_rates$tp + US_rates$fn)
fpr = US_rates$fp / (US_rates$fp + US_rates$tn)

# Sanity check: these hand-computed rates should equal the x/y values the
# performance() object produced.
US_rates_comparison = data.frame(US_pred_performance@x.values,
                                        US_pred_performance@y.values,
                                        fpr,
                                        tpr)
colnames(US_rates_comparison) = c("x.values","y.values","fpr","tpr") #<- rename columns accordingly.
#View(US_rates_comparison)

#dev.off() 
# Close every open graphics device so the ROC plot starts on a clean one.
while (!is.null(dev.list())) dev.off()

# ROC curve for the US model (built from OOB vote shares vs actual labels).
plot(US_pred_performance, 
     col = "orange", 
     lwd = 3, 
     main = "ROC curve")
grid(col = "black")


# Add a 45-degree reference line (performance of a no-skill classifier).
abline(a = 0, 
       b = 1,
       lwd = 2,
       lty = 2,
       col = "blue")

# Calculate the area under curve (AUC), which can help you compare the 
# ROC curves of different models for their relative accuracy.
US_auc_RF = performance(US_prediction_comparison, 
                               "auc")@y.values[[1]]
US_auc_RF
## [1] 0.8515118
# Annotate the ROC plot with the AUC, rounded to two decimals.
text(x = 0.5,
     y = 0.5,
     labels = sprintf("AUC = %s", round(US_auc_RF, 2)))
#finding the error rate, hit rate 
# The original compared the OOB predictions (training rows only) against
# the FULL input frames, triggering silent recycling ("longer object
# length is not a multiple ..." warnings), and divided by a hard-coded
# 1000. Compare against the training labels and use the actual length.

# US model: 0 = correct OOB prediction, 1 = misclassified.
US_double_original <- as.double(US_train$treatment)
US_double_predictions <- as.double(US_initial_RF$predicted)
US_residuals <- as.data.frame(ifelse(US_double_predictions - US_double_original == 0, 0, 1))
# Hit rate (proportion of correct OOB predictions).
1 - sum(US_residuals) / nrow(US_residuals)
# Number of correct predictions.
sum(US_residuals == 0)

# UK model: same computation.
UK_double_original <- as.double(UK_train$treatment)
UK_double_predictions <- as.double(UK_initial_RF$predicted)
UK_residuals <- as.data.frame(ifelse(UK_double_predictions - UK_double_original == 0, 0, 1))
1 - sum(UK_residuals) / nrow(UK_residuals)
sum(UK_residuals == 0)
# Most important variables: Gini-based importance (type = 2), scaled.
kable(as.data.frame(importance(US_initial_RF, type = 2, scale = TRUE)))
MeanDecreaseGini
Gender 0.8894703
self_employed 0.5259106
family_history 6.9679091
remote_work 0.9321551
tech_company 0.8632699
benefits 1.7220652
care_options 5.2530838
wellness_program 1.3839687
seek_help 1.7950005
anonymity 1.4661827
leave 2.4058954
mental_health_consequence 2.1110134
phys_health_consequence 1.2674832
coworkers 1.5316040
supervisor 1.8755152
mental_health_interview 0.9744162
phys_health_interview 2.0551686
mental_vs_physical 1.6707267
obs_consequence 0.8177320
Age 3.2120433
importance_colmn <- as.data.frame(US_initial_RF$importance)
#varImpPlot(US_initial_RF,type=2)
# Importance table with an explicit feature-name column for plotting.
feat_imp_df <- importance(US_initial_RF) %>%
  data.frame() %>%
  mutate(feature = row.names(.))

# Horizontal bar chart of Mean Decrease Gini, sorted by importance.
# geom_col() is the idiomatic replacement for geom_bar(stat = "identity").
ggplot(feat_imp_df, aes(x = reorder(feature, MeanDecreaseGini),
                        y = MeanDecreaseGini)) +
  geom_col() +
  coord_flip() +
  theme_classic() +
  labs(
    x     = "Feature",
    y     = "Importance",
    title = "Mean Decrease Gini Plot U.S."
  )

# Per-tree error trajectories: OOB error plus the two class error rates.
US_RF_error <- data.frame(seq_len(nrow(US_initial_RF$err.rate)),
                          US_initial_RF$err.rate)

#View(US_RF_error)

colnames(US_RF_error) <- c("Number of Trees", "OutBoxerror",
                           "No", "Yes")

# Add another variable that measures the difference between the error rates, in
# some situations we would want to minimize this but need to use caution because
# it could be that the differences are small but that both errors are really high,
# just another point to track. 

US_RF_error$Diff <- US_RF_error$Yes - US_RF_error$No

# Interactive line plot of all four error curves.
fig_US <- plot_ly(x = US_RF_error$`Number of Trees`,
                  y = US_RF_error$Diff, name = "Diff",
                  type = 'scatter', mode = 'lines') %>%
  add_trace(y = US_RF_error$OutBoxerror, name = "O.O.B error") %>%
  add_trace(y = US_RF_error$No, name = "No") %>%
  add_trace(y = US_RF_error$Yes, name = "Yes")

fig_US
100200300400500−0.2−0.100.10.20.3
DiffO.O.B errorNoYes
# Repeat of the US OOB confusion matrix for the model-evaluation section.
kable(US_initial_RF$confusion)
0 1 class.error
0 266 145 0.3527981
1 70 419 0.1431493
# Overall OOB accuracy; drop the class.error column before summing so the
# error rates do not contaminate the denominator.
US_conf_counts <- US_initial_RF$confusion[, c("0", "1")]
US_RF_acc <- sum(diag(US_conf_counts)) / sum(US_conf_counts)
US_RF_acc
## [1] 0.7606919
# Full caret report on the OOB predictions, treating "1" (sought
# treatment) as the positive class; mode = "everything" adds F1 etc.
confusionMatrix(as.factor(US_initial_RF$predicted), as.factor(US_train$treatment), positive = "1", dnn=c("Prediction", "Actual"), mode = "everything")
## Confusion Matrix and Statistics
## 
##           Actual
## Prediction   0   1
##          0 266  70
##          1 145 419
##                                           
##                Accuracy : 0.7611          
##                  95% CI : (0.7319, 0.7886)
##     No Information Rate : 0.5433          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5115          
##                                           
##  Mcnemar's Test P-Value : 4.494e-07       
##                                           
##             Sensitivity : 0.8569          
##             Specificity : 0.6472          
##          Pos Pred Value : 0.7429          
##          Neg Pred Value : 0.7917          
##               Precision : 0.7429          
##                  Recall : 0.8569          
##                      F1 : 0.7958          
##              Prevalence : 0.5433          
##          Detection Rate : 0.4656          
##    Detection Prevalence : 0.6267          
##       Balanced Accuracy : 0.7520          
##                                           
##        'Positive' Class : 1               
## 
# Five rows with the lowest OOB error for the US model.
kable(head(US_RF_error[order(US_RF_error$OutBoxerror, decreasing = FALSE),], n=5))
Number of Trees OutBoxerror No Yes Diff
46 46 0.2177778 0.2895377 0.1574642 -0.1320735
47 47 0.2188889 0.2895377 0.1595092 -0.1300285
48 48 0.2188889 0.2919708 0.1574642 -0.1345066
51 51 0.2211111 0.2944039 0.1595092 -0.1348947
52 52 0.2211111 0.2944039 0.1595092 -0.1348947
# per the table above, the minimum out-of-box error for the US model is 0.2177778, first reached at 46 trees

Model Comparison

# Two-sample test for equality of the two models' hit proportions.
# NOTE(review): 506 and 514 out of 1000 are hard-coded and do not match
# the correct-prediction counts printed above (486 and 532) — confirm
# where these numbers come from before trusting the p-value.
prop.test(x = c(506,514), n = c(1000, 1000),
           alternative = "two.sided",correct=FALSE)
## 
##  2-sample test for equality of proportions without continuity
##  correction
## 
## data:  c(506, 514) out of c(1000, 1000)
## X-squared = 0.12805, df = 1, p-value = 0.7205
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.05181596  0.03581596
## sample estimates:
## prop 1 prop 2 
##  0.506  0.514
Drag to outliner or Upload
Close